---
title: "Lex corpus analysis"
author: "Francesco Bailo"
date: "`r Sys.time()`"
output:
  pdf_document:
    latex_engine: xelatex
---

```{r setup, include=FALSE}
# Chunk defaults for the whole report: code, messages and warnings are hidden
# so that only results appear in the rendered document.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
# Packages attached for the rest of the report.
library(tidyverse)
library(dplyr)
library(readxl)
library(kableExtra)
library(cowplot)
# Use the black-and-white theme as the default for every ggplot that follows.
ggplot2::theme_set(ggplot2::theme_bw())
```


## Data

```{r load}
# Raw corpus (its row count is reported below as the number of documents).
# Documents whose id contains "rouss" (any case) are excluded.
corpus.df <-
  readRDS("corpus_raw.rds") %>%
  dplyr::filter(!grepl("rouss", doc_id, ignore.case = TRUE))

# Tidy versions of the PDF and DOCX corpora, both keyed by `doc_id`.
corpus_pdf.tidy <-
  read.csv("corpus_pdf.tidy.csv")

corpus_docx.tidy <-
  read.csv("corpus_docx.tidy.csv")

# LexRousseau document-level metadata (joined to the results on `ID_n`).
Dataset_unico_250622 <-
  read_excel("Dataset_unico_250622.xlsx")

# Sentence-level cosine similarity scores.
cos_sim_results.df <- read.csv("cos_sim_results.csv")

# Token-level cosine similarity, one score per document pair.
cos_sim_results.token <-
  read.csv("cos_sim_results.token.csv")

```

* Total number of documents: `r nrow(corpus.df)`

* Total number of DOCX documents: `r sum(grepl("docx", corpus.df$doc_id, ignore.case = T))`

* Total number of PDF documents: `r sum(grepl("pdf", corpus.df$doc_id, ignore.case = T))`

* Total number of matched documents (DOCX in PDF): `r sum(unique(corpus_docx.tidy$doc_id) %in% unique(corpus_pdf.tidy$doc_id))` (`r round(sum(unique(corpus_docx.tidy$doc_id) %in% unique(corpus_pdf.tidy$doc_id)) / length(unique(corpus_docx.tidy$doc_id)) * 100, 2)`%)

* Total number of matched documents (PDF in DOCX): `r sum(unique(corpus_pdf.tidy$doc_id) %in% unique(corpus_docx.tidy$doc_id))` (`r round(sum(unique(corpus_pdf.tidy$doc_id) %in% unique(corpus_docx.tidy$doc_id)) / length(unique(corpus_pdf.tidy$doc_id)) * 100, 2)`%)

### Unmatched documents

#### DOCXs without a PDF

```{r results = 'asis'}
# Ids of DOCX documents with no PDF of the same id, in order of first
# appearance, written out as one comma-separated line.
cat(
  paste(
    setdiff(corpus_docx.tidy$doc_id, corpus_pdf.tidy$doc_id),
    collapse = ", "
  )
)
```

#### PDFs without a DOCX

```{r results = 'asis'}
# Ids of PDF documents with no DOCX of the same id, in order of first
# appearance, written out as one comma-separated line.
cat(
  paste(
    setdiff(corpus_pdf.tidy$doc_id, corpus_docx.tidy$doc_id),
    collapse = ", "
  )
)
```

## Method

1. Text was read into R with `readtext::readtext()` (Benoit & Obeng, 2023).

2. For PDF documents, text before a match with the regular expression `(DISEGNO|PROPOSTA) DI LEGGE` was removed. This excluded everything that is not part of the body of the bill. 

3. For DOCX documents, text before a match with any of the regular expressions `\\n(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1`, `. (Art.|Articolo|ART.|ARTICOLO)([ ]+)?1`, `^(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1`, `(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1` was removed.

4. The text was split into sentences with `tidytext::unnest_sentences()` (Silge & Robinson, 2016).

5. The text was pre-processed removing sentences that did not contain any letter (`[a-z]`) or that did not contain at least three words (`\\S+`).

6. For each DOCX document, every sentence was then compared to every sentence of the corresponding PDF document using the "SentenceTransformers" framework (Reimers & Gurevych, 2019) and the "sentence-bert-base" sentence-transformers model for Italian (Edoardo Federici, 2022), calculating the cosine similarity of the two sentences.

## Examples 

```{r}

# Sentence pairs with their similarity score. `level` bins the score into
# eight intervals (see `cut()`), used below to sample examples per level.
cos_sim_results_full_text <- read.csv("cos_sim_results_full_text.csv")
cos_sim_results_full_text$level <-
  cut(cos_sim_results_full_text$score, breaks = 8)

```

### Examples: eight levels

```{r results = 'asis'}

# Two randomly drawn sentence pairs from each score level, rendered as a
# LaTeX longtable. Columns 2 and 3 get a fixed width so long text wraps
# (presumably these are the two sentence columns -- confirm against the CSV).
cos_sim_results_full_text %>%
  dplyr::group_by(level) %>%
  dplyr::slice_sample(n = 2) %>%
  kbl(digits = 2, format = "latex", booktabs = TRUE, longtable = TRUE) %>%
  column_spec(2, width = "15em") %>%
  column_spec(3, width = "15em") %>%
  kable_styling(latex_options = c("striped", "repeat_header"))
```

### Examples: >.95 (high similarity)

```{r results = 'asis'}

# Up to eight randomly drawn sentence pairs with a score above .95.
# `slice_sample()` replaces the superseded `sample_n()`, which stops with an
# error when fewer than eight rows pass the filter; `slice_sample()` simply
# returns the rows that are available.
cos_sim_results_full_text %>%
  dplyr::filter(score > .95) %>%
  dplyr::slice_sample(n = 8) %>%
  kbl(digits = 2, format = "latex", booktabs = TRUE, longtable = TRUE) %>%
  column_spec(2, width = "15em") %>%
  column_spec(3, width = "15em") %>%
  kable_styling(latex_options = c("striped", "repeat_header"))
```

## Summary of results

```{r}

# Document-level summary of the sentence similarity results.
# Step 1: for every (doc_id, i) pair keep the best score (`i` presumably
#         indexes the DOCX sentence -- confirm against the scoring script).
# Step 2: per document, count the sentences and summarise those best scores;
#         a sentence counts as "high similarity" when its best score is > .95.
# Step 3: attach the LexRousseau metadata (only documents present in both).
doc_summary_stats.df <-
  cos_sim_results.df %>%
  dplyr::group_by(doc_id, i) %>%
  dplyr::summarise(max_cos_sim = max(score), .groups = "drop") %>%
  dplyr::group_by(doc_id) %>%
  dplyr::summarise(sentences = n(),
                   median_sim = median(max_cos_sim),
                   mean_sim = mean(max_cos_sim),
                   high_sim_n = sum(max_cos_sim > .95),
                   high_sim_perc = sum(max_cos_sim > .95) / n() * 100) %>%
  dplyr::inner_join(Dataset_unico_250622, by = c(doc_id = "ID_n"))

# Token-level similarity, matched on the numeric id obtained by stripping
# ".pdf" from `item1`. `fixed = TRUE` makes the dot literal; as a regex,
# ".pdf" would also remove any other character followed by "pdf".
doc_summary_stats.df$token_sim <-
  cos_sim_results.token$similarity[
    match(doc_summary_stats.df$doc_id,
          as.numeric(gsub(".pdf",
                          "",
                          cos_sim_results.token$item1,
                          fixed = TRUE)))]

# Two-level party factor; any other value of `Partito` becomes NA.
doc_summary_stats.df <-
  doc_summary_stats.df %>%
  dplyr::mutate(Partito.n = factor(case_when(
    Partito %in% c("M5S", "M5S, On. Azzurra Pia Maria Cancelleri",
                   "M5S, On. Paolo Bernini") ~ "M5S only",
    Partito %in% "M5S e altri" ~ "M5S e altri"),
    levels = c("M5S e altri", "M5S only")))

# `Data presentazione` holds Excel serial day numbers stored as text
# (hence the 1899-12-30 origin); non-numeric entries become NA here.
doc_summary_stats.df$date <-
  as.Date(as.numeric(doc_summary_stats.df$`Data presentazione`),
          origin = "1899-12-30")

# The one entry written out as text is set by hand. `%in%` is used so that a
# missing `Data presentazione` can never produce an NA subscript.
doc_summary_stats.df$date[
  doc_summary_stats.df$`Data presentazione` %in% "14 gennaio 2016"] <-
  as.Date("2016-01-14")

doc_summary_stats.df$assegnato_non_iniziato <-
  doc_summary_stats.df$Esito %in% "Assegnato (non ancora iniziato l'esame)"

# Flags for the least similar documents: mean similarity at or below the
# 5th / 10th percentile (`probs` spelled out instead of the partial `p`).
doc_summary_stats.df$bottom_5pct <-
  doc_summary_stats.df$mean_sim <=
  quantile(doc_summary_stats.df$mean_sim, probs = .05)

doc_summary_stats.df$bottom_10pct <-
  doc_summary_stats.df$mean_sim <=
  quantile(doc_summary_stats.df$mean_sim, probs = .1)

```

* Number of DOCX documents: `r nrow(doc_summary_stats.df)`

* Average proportion of high sentence-sentence similarity: `r round(mean(doc_summary_stats.df$high_sim_perc),2)`%

* Median proportion of high sentence-sentence similarity: `r round(median(doc_summary_stats.df$high_sim_perc),2)`%

* Number (and proportion) of DOCX with less than 20% of high-similarity sentences: `r sum(doc_summary_stats.df$high_sim_perc < 20)` (`r round(sum(doc_summary_stats.df$high_sim_perc < 20) / nrow(doc_summary_stats.df) * 100, 2)`%)

* Number (and proportion) of DOCX with more than 50% of high-similarity sentences: `r sum(doc_summary_stats.df$high_sim_perc > 50)` (`r round(sum(doc_summary_stats.df$high_sim_perc > 50) / nrow(doc_summary_stats.df) * 100, 2)`%)

* Number (and proportion) of DOCX with more than 80% of high-similarity sentences: `r sum(doc_summary_stats.df$high_sim_perc > 80)` (`r round(sum(doc_summary_stats.df$high_sim_perc > 80) / nrow(doc_summary_stats.df) * 100, 2)`%)

* Number (and proportion) of DOCX with more than 90% of high-similarity sentences: `r sum(doc_summary_stats.df$high_sim_perc > 90)` (`r round(sum(doc_summary_stats.df$high_sim_perc > 90) / nrow(doc_summary_stats.df) * 100, 2)`%)


* Mean sentence similarity (of most similar pairs): 

```{r}
# Five-number summary (plus mean) of the per-document mean similarity.
summary(doc_summary_stats.df[["mean_sim"]])
```

* Mean: `r round(mean(doc_summary_stats.df$mean_sim), 2)`

* Standard deviation: `r round(sd(doc_summary_stats.df$mean_sim), 2)`

* Least similar 5%:
    
```{r}
# 5th percentile of the per-document mean similarity. The argument is spelled
# `probs` in full rather than relying on partial matching of `p`.
round(quantile(doc_summary_stats.df$mean_sim, probs = .05), 2)
```


```{r}
# Summary of mean similarity among the least similar 5% of documents.
# The `bottom_5pct` flag (mean_sim <= 5th percentile) is reused so that this
# summary covers the same documents as the flag used in the regressions; the
# previous strict `<` could leave out a document sitting on the threshold.
summary(doc_summary_stats.df$mean_sim[doc_summary_stats.df$bottom_5pct])
```
   
  


* Token/word cosine similarity (cosine similarity with tf-idf for robustness):

```{r}
# Five-number summary (plus mean) of the token-level cosine similarity.
summary(doc_summary_stats.df[["token_sim"]])
```


```{r fig.cap = 'High similarity sentences by DOCX (with median)'}
# Density of the per-document share of high-similarity sentences; the vertical
# line marks the median share.
ggplot2::ggplot(doc_summary_stats.df, aes(x = high_sim_perc)) +
  geom_density() +
  geom_vline(xintercept = median(doc_summary_stats.df[["high_sim_perc"]]))
```

```{r fig.cap = 'Mean similarity score by DOCX'}
# Density of the per-document mean similarity.
ggplot(doc_summary_stats.df, aes(x = mean_sim)) +
  geom_density()
```

```{r echo = FALSE}
# Save the mean-similarity density as a PNG for use outside this report.
# The output directory is created first if missing, so knitting from a fresh
# checkout does not fail on the first figure export.
dir.create("fig", showWarnings = FALSE)
ggsave(
  filename = "fig/mean-sim-score.png",
  plot = doc_summary_stats.df %>%
    ggplot(aes(x = mean_sim)) +
    geom_density() +
    theme_bw() +
    labs(x = "document-level cosine similarity"),
  width = 6,
  height = 4
)
```


```{r fig.cap = 'Median similarity score by DOCX'}
# Density of the per-document median similarity.
ggplot(doc_summary_stats.df, aes(x = median_sim)) +
  geom_density()
```

```{r fig.cap = 'Token cosine similarity'}
# Density of the token-level cosine similarity.
ggplot(doc_summary_stats.df, aes(x = token_sim)) +
  geom_density()
```

```{r}
# Sentence-level mean similarity against token-level similarity, one point per
# document. The y label is now passed as `y =`; it was previously an unnamed
# argument to `labs()` and therefore never reached the y axis.
doc_summary_stats.df %>%
  ggplot(aes(x = mean_sim, y = token_sim)) +
  geom_point() +
  labs(x = "Mean sentence similarity", y = "Token similarity")
```


```{r fig.cap = 'High similarity sentences by Tematica (median)'}
# Median share of high-similarity sentences per Tematica, as horizontal bars
# ordered by that median.
doc_summary_stats.df %>%
  dplyr::group_by(Tematica) %>%
  dplyr::summarise(median_high_sim_perc = median(high_sim_perc)) %>%
  dplyr::mutate(Tematica = reorder(Tematica, median_high_sim_perc)) %>%
  ggplot(aes(x = Tematica, y = median_high_sim_perc)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()
```

```{r fig.cap = 'Mean similarity by Tematica'}
# Distribution of per-document mean similarity within each Tematica, as
# horizontal boxplots ordered by the group's average mean similarity.
doc_summary_stats.df %>%
  dplyr::mutate(Tematica = reorder(Tematica, mean_sim)) %>%
  ggplot(aes(x = Tematica, y = mean_sim)) +
  geom_boxplot() +
  labs(x = NULL) +
  coord_flip()
```


```{r fig.cap = 'High similarity sentences by firmatario (median, only if Firmatario has more than one proposal)', fig.width = 10, fig.height = 12}
# Median share of high-similarity sentences per signatory, restricted to
# signatories with more than one document, as ordered horizontal bars.
doc_summary_stats.df %>%
  dplyr::group_by(`Firmatario Rousseau`) %>%
  dplyr::summarise(median_high_sim_perc = median(high_sim_perc),
                   n = n()) %>%
  dplyr::filter(n > 1) %>%
  dplyr::mutate(`Firmatario Rousseau` = reorder(`Firmatario Rousseau`,
                                                median_high_sim_perc)) %>%
  ggplot(aes(x = `Firmatario Rousseau`, y = median_high_sim_perc)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()
```

```{r  fig.cap = 'High similarity (%) by n commenti'}
# Share of high-similarity sentences against the number of comments.
ggplot(doc_summary_stats.df, aes(x = `n commenti`, y = high_sim_perc)) +
  geom_point()
```

```{r  fig.cap = 'Mean similarity by n commenti'}
# Mean similarity against the number of comments.
ggplot(doc_summary_stats.df, aes(x = `n commenti`, y = mean_sim)) +
  geom_point()
```

```{r  fig.cap = 'High similarity (%) by n followers twitter (log)'}
# Share of high-similarity sentences against the natural log of the
# signatory's Twitter follower count.
ggplot(doc_summary_stats.df,
       aes(x = log(`n followers twitter`), y = high_sim_perc)) +
  geom_point()
```

```{r  fig.cap = 'High similarity (%) by n risposte firmatario'}
# Share of high-similarity sentences against the number of signatory replies.
ggplot(doc_summary_stats.df,
       aes(x = `n risposte firmatario`, y = high_sim_perc)) +
  geom_point()
```

```{r  fig.cap = 'Mean similarity by n risposte firmatario'}
# Mean similarity against the number of signatory replies.
ggplot(doc_summary_stats.df,
       aes(x = `n risposte firmatario`, y = mean_sim)) +
  geom_point()
```

```{r  fig.cap = 'High similarity (%) by Partito'}
# Share of high-similarity sentences by party group. `Partito` is collapsed to
# "M5S" / "M5S e altri" for this plot only; any other value becomes NA.
ggplot(
  dplyr::mutate(
    doc_summary_stats.df,
    Partito = case_when(
      Partito %in% c("M5S", "M5S, On. Azzurra Pia Maria Cancelleri",
                     "M5S, On. Paolo Bernini") ~ "M5S",
      Partito %in% "M5S e altri" ~ "M5S e altri"
    )
  ),
  aes(x = Partito, y = high_sim_perc)
) +
  geom_boxplot() +
  labs(x = NULL)
```

```{r  fig.cap = 'Mean similarity by Partito'}
# Mean similarity by party group. `Partito` is collapsed to
# "M5S" / "M5S e altri" for this plot only; any other value becomes NA.
ggplot(
  dplyr::mutate(
    doc_summary_stats.df,
    Partito = case_when(
      Partito %in% c("M5S", "M5S, On. Azzurra Pia Maria Cancelleri",
                     "M5S, On. Paolo Bernini") ~ "M5S",
      Partito %in% "M5S e altri" ~ "M5S e altri"
    )
  ),
  aes(x = Partito, y = mean_sim)
) +
  geom_boxplot() +
  labs(x = NULL)
```

```{r fig.cap = "Mean similarity by time"}
# Mean similarity against the presentation date, one point per document.
ggplot(doc_summary_stats.df, aes(x = date, y = mean_sim)) +
  geom_point()
```

```{r}
# Two-panel figure written to disk: mean similarity against presentation date
# (left) and against the number of comments on a pseudo-log axis (right).
# Only documents presented to the Senate ("Upper") or the Chamber ("Lower")
# are plotted; the point shape encodes the chamber. Both panels share the
# y range .5-1, and the right panel hides its y axis.
ggsave(
  filename = "fig/sim-by-time.png",
  plot = plot_grid(
    doc_summary_stats.df %>%
      dplyr::filter(`Tipo presentazione` %in%
                      c("Presentato al Senato", "Presentato alla Camera")) %>%
      dplyr::mutate(
        chamber = dplyr::if_else(
          `Tipo presentazione` == "Presentato al Senato", "Upper", "Lower"
        )
      ) %>%
      ggplot(aes(x = date, y = mean_sim)) +
      geom_point(aes(shape = chamber)) +
      geom_smooth() +
      scale_y_continuous(limits = c(.5, 1)) +
      theme_bw() +
      guides(shape = "none") +
      labs(x = "day", y = "document-level cosine similarity"),
    doc_summary_stats.df %>%
      dplyr::filter(`Tipo presentazione` %in%
                      c("Presentato al Senato", "Presentato alla Camera")) %>%
      dplyr::mutate(
        chamber = dplyr::if_else(
          `Tipo presentazione` == "Presentato al Senato", "Upper", "Lower"
        )
      ) %>%
      ggplot(aes(x = `n commenti`, y = mean_sim)) +
      geom_point(aes(shape = chamber)) +
      geom_smooth() +
      scale_x_continuous(trans = scales::pseudo_log_trans(base = 10),
                         breaks = c(0, 10, 100, 1000, 2000, 3000)) +
      scale_y_continuous(limits = c(.5, 1)) +
      theme_bw() +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks.y = element_blank()) +
      labs(x = "number of comments", y = "document-level cosine similarity"),
    ncol = 2
  ),
  width = 12,
  height = 4
)
```


```{r fig.cap = "Token similarity by time"}
# Token-level similarity against the presentation date, one point per document.
ggplot(doc_summary_stats.df, aes(x = date, y = token_sim)) +
  geom_point()
```

```{r results = 'asis'}
# Signatory replies by similarity band. Documents are assigned to the first
# band that applies: bottom 5%, then the rest of the bottom 10%, then the
# remaining top 90%.
# The "k+ responses" columns are the share of documents in the band with MORE
# than k replies. Missing reply counts are ignored in the numerator but still
# count in the denominator `n()`.
doc_summary_stats.df %>%
  dplyr::mutate(
    quantile = factor(
      case_when(
        bottom_5pct ~ "bottom 5%",
        bottom_10pct ~ "bottom 5-10%",
        !bottom_5pct & !bottom_10pct ~ "top 90%"
      ),
      levels = c("bottom 5%", "bottom 5-10%", "top 90%")
    )
  ) %>%
  dplyr::group_by(quantile) %>%
  dplyr::summarise(
    `0+ responses` = sum(`n risposte firmatario` > 0, na.rm = TRUE) / n(),
    `1+ responses` = sum(`n risposte firmatario` > 1, na.rm = TRUE) / n(),
    `5+ responses` = sum(`n risposte firmatario` > 5, na.rm = TRUE) / n(),
    `mean responses` = mean(`n risposte firmatario`, na.rm = TRUE)
  ) %>%
  kbl(digits = 4, format = "latex", booktabs = TRUE)
```

```{r}
# Same reply statistics as the banded table, computed over all documents.
# The "k+ responses" columns are the share of documents with MORE than k
# replies. Missing reply counts are ignored in the numerator but still count
# in the denominator `n()`.
doc_summary_stats.df %>%
  dplyr::summarise(
    `0+ responses` = sum(`n risposte firmatario` > 0, na.rm = TRUE) / n(),
    `1+ responses` = sum(`n risposte firmatario` > 1, na.rm = TRUE) / n(),
    `5+ responses` = sum(`n risposte firmatario` > 5, na.rm = TRUE) / n(),
    `mean responses` = mean(`n risposte firmatario`, na.rm = TRUE)
  ) %>%
  kbl(digits = 4, format = "latex", booktabs = TRUE)
```


```{r}
# Three-level (unordered) period factor derived from the presentation date:
#   up to and including 2018-03-22   -> "XVII leg."
#   2018-03-23 through 2019-09-05    -> "XVIII leg. - Conte I"
#   after 2019-09-05                 -> "XVIII leg. - Conte II"
# Documents with a missing date get NA.
doc_summary_stats.df$time_period.n <-
  factor(
    dplyr::case_when(
      doc_summary_stats.df$date <= as.Date("2018-03-22") ~ "XVII leg.",
      doc_summary_stats.df$date > as.Date("2018-03-22") &
        doc_summary_stats.df$date <= as.Date("2019-09-05") ~
        "XVIII leg. - Conte I",
      doc_summary_stats.df$date > as.Date("2019-09-05") ~
        "XVIII leg. - Conte II"
    ),
    levels = c("XVII leg.",
               "XVIII leg. - Conte I",
               "XVIII leg. - Conte II"),
    ordered = FALSE
  )
```


## Regression

```{r}
# fit1: OLS of the per-document mean similarity on party group, number of
# sentences, comments, signatory replies and period.
fit1 <-
  lm(mean_sim ~
       Partito.n +
       sentences +
       `n commenti` +
       `n risposte firmatario` +
       time_period.n,
     data = doc_summary_stats.df)

# fit1a / fit1b: logistic regressions with the same predictors, for being
# among the least similar 5% / 10% of documents.
fit1a <-
  glm(bottom_5pct ~
        Partito.n +
        sentences +
        `n commenti` +
        `n risposte firmatario` +
        time_period.n,
      family = "binomial",
      data = doc_summary_stats.df)

fit1b <-
  glm(bottom_10pct ~
        Partito.n +
        sentences +
        `n commenti` +
        `n risposte firmatario` +
        time_period.n,
      family = "binomial",
      data = doc_summary_stats.df)

# fit2: OLS of the token-level similarity.
# The time trend uses the cleaned `date` column (in days). Coercing the raw
# `Data presentazione` strings turned the entry written out as text into NA,
# so that document was silently dropped from the model. Using `date` leaves
# the slope in the same unit (per day); only the intercept shifts with the
# change of origin.
fit2 <-
  lm(token_sim ~
       Partito.n +
       `n commenti` + Legislatura +
       `n risposte firmatario` +
       as.numeric(date) +
       log(`n followers twitter`),
     data = doc_summary_stats.df)
```

```{r results = 'asis'}
# Regression table for the OLS model and the two logistic models, emitted
# as-is into the document (stargazer's default output type).
library(stargazer)
stargazer(fit1, fit1a, fit1b)
```

```{r include = FALSE}
# HTML copy of the regression table (fit1 and fit1a only), written to the fig/
# folder; the chunk is excluded from the rendered report.
library(stargazer)
stargazer(fit1, fit1a, type = "html", out = "fig/regression-table.html")
```

```{r}
library(tidyverse)
library(cowplot)
library(broom)

# Two-panel coefficient plot written to disk: the OLS model (fit1) on the left
# and the bottom-5% logistic model (fit1a) on the right. Each panel shows the
# point estimate and confidence interval for the number of sentences, comments
# and signatory replies; the intercept, party and period terms are dropped.
# The dashed vertical line marks zero.
ggsave(
  filename = "fig/coefplot.png",
  plot = plot_grid(
    plotlist = Map(
      function(fit, x_label) {
        fit %>%
          tidy(conf.int = TRUE) %>%
          dplyr::filter(
            !grepl("Intercept|Partito.nM5S|time_period", term)
          ) %>%
          # Replace the model term names with readable labels.
          dplyr::mutate(
            term = case_when(
              term == "sentences" ~ "n sentences",
              term == "`n commenti`" ~ "n comments",
              term == "`n risposte firmatario`" ~ "n responses"
            )
          ) %>%
          ggplot(aes(estimate, term)) +
          geom_point() +
          geom_errorbar(aes(xmin = conf.low, xmax = conf.high)) +
          geom_vline(xintercept = 0, lty = 2) +
          labs(x = x_label, y = NULL)
      },
      list(fit1, fit1a),
      c("Document-level similarity OLS",
        "Least similar documents (Bottom 5%)\nlogistic regression")
    ),
    ncol = 2
  ),
  width = 8,
  height = 4
)

```


\clearpage

## Full results

The table below reports, for each document, the number of sentences in the DOCX and the number of those sentences with a high-similarity match in the PDF, that is, where the cosine similarity of at least one sentence pair (DOCX-PDF) was above .95.

```{r results = 'asis'}

# Per-document results table: the columns from `doc_id` through
# `high_sim_perc`, rendered as a striped LaTeX longtable with a repeated
# header. The chunk option was misspelled `resul`, which knitr does not
# recognise as `results`.
doc_summary_stats.df %>%
  dplyr::select(doc_id:high_sim_perc) %>%
  kbl(digits = 2, format = "latex", booktabs = TRUE, longtable = TRUE) %>%
  kable_styling(latex_options = c("repeat_header", "striped"))

```

# Figure 1

```{r}
# `library()` rather than `require()`: a missing package should stop the knit
# instead of returning FALSE and failing later.
library(tidyverse)

# Figure 1: monthly line of `X.on.Rousseau` (y axis labelled "%"), written to
# disk. Each month is placed on its 15th day. Dashed grey vertical segments
# at 2013-03-01, 2018-04-01, 2019-08-01 and 2020-09-01 separate the periods
# named by the text annotations below the series.
ggsave(
  filename = "fig/fig1.png",
  plot = read.csv("fig1.csv") %>%
    dplyr::mutate(
      month = as.Date(paste0("15/", month), format = "%d/%m/%Y")
    ) %>%
    ggplot(aes(x = month, y = X.on.Rousseau)) +
    geom_line() +
    # The y range extends below zero to leave room for the period labels.
    scale_y_continuous(limits = c(-9, 100)) +
    geom_segment(x = as.Date("2013-03-01"), xend = as.Date("2013-03-01"),
                 y = Inf, yend = -8,
                 colour = "grey", size = .4, linetype = 2) +
    annotate("text", x = as.Date("2015-12-01"), y = -7,
             label = "Opposition") +
    annotate("text", x = as.Date("2018-10-01") + 50, y = -7,
             label = "Government\n(Conte 1)") +
    annotate("text", x = as.Date("2020-01-01") + 55, y = -7,
             label = "Government\n(Conte 2)") +
    geom_segment(x = as.Date("2018-04-01"), xend = as.Date("2018-04-01"),
                 y = Inf, yend = -8,
                 colour = "grey", size = .4, linetype = 2) +
    geom_segment(x = as.Date("2019-08-01"), xend = as.Date("2019-08-01"),
                 y = Inf, yend = -8,
                 colour = "grey", size = .4, linetype = 2) +
    geom_segment(x = as.Date("2020-09-01"), xend = as.Date("2020-09-01"),
                 y = Inf, yend = -8,
                 colour = "grey", size = .4, linetype = 2) +
    labs(x = NULL, y = "%") +
    theme_bw(),
  width = 9,
  height = 4
)

```

# Figure 2

```{r}
# `library()` rather than `require()`: a missing package should stop the knit
# instead of returning FALSE and failing later.
library(tidyverse)
library(scales)

# Figure 2: yearly bars of the average number of interactions (top facet) and
# comments (bottom facet), each facet with its own scales.
# `geom_col()` draws the supplied values directly; it replaces
# `geom_histogram(stat = 'identity')`, which only produced bars by overriding
# the histogram's binning stat.
ggsave(
  filename = "fig/fig2.png",
  plot = read.csv("fig2.csv") %>%
    tidyr::pivot_longer(
      cols = Avg.n..of.comments:Avg..N..of.interactions
    ) %>%
    dplyr::mutate(
      name = factor(
        case_when(
          grepl("interactions", name) ~ "Average n. of interactions",
          grepl("comments", name) ~ "Average n. of comments"
        ),
        ordered = TRUE,
        levels = c("Average n. of interactions",
                   "Average n. of comments")
      )
    ) %>%
    ggplot(aes(x = year, y = value)) +
    geom_col() +
    facet_wrap("name", ncol = 1, scales = "free") +
    labs(x = NULL, y = NULL) +
    theme_bw(),
  width = 9,
  height = 6
)

# Alternative take on Figure 2 from document-level data: one point per
# document against presentation date, with a smoothed trend and a pseudo-log
# y axis. The two reshaped columns are selected by name, so the result does
# not depend on where `mutate()` places them.
ggsave(
  filename = "fig/fig2_alttake.png",
  plot = doc_summary_stats.df %>%
    dplyr::mutate(`n. interactions` = `n risposte firmatario`,
                  `n. comments` = `n commenti`) %>%
    tidyr::pivot_longer(cols = c(`n. interactions`, `n. comments`)) %>%
    ggplot(aes(x = date, y = value)) +
    geom_point() +
    geom_smooth(se = FALSE) +
    scale_y_continuous(trans = scales::pseudo_log_trans()) +
    labs(x = NULL, y = NULL) +
    theme_bw() +
    facet_wrap("name", ncol = 1, scales = "free"),
  width = 9,
  height = 6
)

```


## References

* Benoit, K., & Obeng, A. (2023). readtext: Import and handling for plain and formatted text files [Manual]. https://CRAN.R-project.org/package=readtext

* Edoardo Federici. (2022). Sentence-bert-base, sentence-transformer for Italian. *Hugging Face*. https://doi.org/10.57967/hf/0112

* Silge, J., & Robinson, D. (2016). tidytext: Text mining and analysis using tidy data principles in R. *JOSS*, 1(3). https://doi.org/10.21105/joss.00037

* Reimers, N., & Gurevych, I. (2019, November). Sentence-BERT: Sentence embeddings using siamese BERT-Networks. *Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing*. https://arxiv.org/abs/1908.10084

